We showcase scMC’s capability of detecting context-shared and -specific biological signals by applying it to two mouse skin scRNA-seq datasets containing cells from control and Hedgehog (Hh) activation conditions during skin wound healing.

Load the required libraries

library(scMC)
library(Seurat)
library(patchwork)
library(dplyr)
library(ggplot2)

Load data

scMC takes a list of multiple digital data matrices as input. Features should be in rows and cells in columns. Rownames and colnames should be included.

The scRNA datasets we demonstrated here, including two count data matrices for the control and Hh activation conditions, can be downloaded via this shared Google Drive link.

load("/Users/suoqinjin/Documents/scMC_SeuratWrapper/tutorial/data_dermis.rda")
data.input <- data_dermis # a list of count data matrix, one per dataset
sample.name <- names(data.input)

Part I: Setup the a list of Seurat objects, one per dataset

object.list <- vector("list", length(sample.name))
names(object.list) <- sample.name
for (i in 1:length(object.list)) {
  # Initialize the Seurat object with the raw (non-normalized data) for each dataset
  x <- CreateSeuratObject(counts = data.input[[i]], min.cells = 3, min.features = 200, project = sample.name[i])
  # calculate mitochondrial QC metrics
  x[["percent.mt"]] <- PercentageFeatureSet(x, pattern = "^mt-")
  x$sample.name <- sample.name[i]
  x <- RenameCells(x, new.names = paste0(Cells(x), "_", x$sample.name))
  object.list[[i]] <-  x
  rm(x)
}
lapply(object.list, function(x) dim(x@assays$RNA@data))
#> $Control
#> [1] 17099  2878
#> 
#> $`Hh activation`
#> [1] 16119  1802

Step1. pre-processing each dataset

QC and selecting cells for further analysis

nFeature_RNA1 = c(7000, 7000); nCount_RNA1 = c(40000, 40000); percent.mt1 = c(10, 10)
for (i in 1:length(object.list)) {
  # Visualize QC metrics as a violin plot
  gg <- VlnPlot(object.list[[i]], features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0.0001,cols=c("#a6cee3")) + geom_hline(yintercept = percent.mt1[i], linetype = 2)
  print(gg)
  cowplot::save_plot(filename=paste0("QC1_", sample.name[i],"_.pdf"), plot=gg, base_width = 7, base_height = 4.5)
 # VlnPlot(object.list[[i]], features = c("percent.mt"))+ geom_hline(yintercept = 15, linetype = 2)
  Sys.sleep(2)
  plot1 <- FeatureScatter(object.list[[i]], feature1 = "nCount_RNA", feature2 = "percent.mt", pt.size = 0.1,cols=c("black")) + geom_hline(yintercept = percent.mt1[i], linetype = 2)
  plot2 <- FeatureScatter(object.list[[i]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA", pt.size = 0.1,cols=c("black")) + geom_hline(yintercept = nFeature_RNA1[i], linetype = 2)
  gg <- wrap_plots(plots = plot1, plot2, ncol = 2)
  print(gg)
  cowplot::save_plot(filename=paste0("QC2_", sample.name[i],"_.pdf"), plot=gg, base_width = 10, base_height = 3.5)
  object.list[[i]] <- subset(object.list[[i]], subset = (nFeature_RNA < nFeature_RNA1[i]) & (nCount_RNA < nCount_RNA1[i]) & (percent.mt < percent.mt1[i]))
}

lapply(object.list, function(x) dim(x@assays$RNA@data))
#> $Control
#> [1] 17099  2847
#> 
#> $`Hh activation`
#> [1] 16119  1792

Normalizing, scaling the data and feature selection

object.list <- lapply(X = object.list, FUN = function(x) {
  x <- NormalizeData(x, verbose = FALSE)
  x <- FindVariableFeatures(x, verbose = FALSE, nfeatures = 3000)
 #x <- FindVariableFeatures(x, selection.method = "mean.var.plot", verbose = FALSE, mean.cutoff = c(0.01, 5), dispersion.cutoff = c(0.25, Inf))
  # perform scaling on the previously identified variable features
  x <- ScaleData(x, verbose = FALSE)
})

Part II: Perform an integrated analysis using scMC

future::plan("multiprocess", workers = 4)

Step2. identify clusters with different resolution for each condition

# compute SNN
object.list <- identifyNeighbors(object.list)
#> Performing PCA in Dataset  1 
#> Building SNN on the PCA space in Dataset  1 
#> Performing PCA in Dataset  2 
#> Building SNN on the PCA space in Dataset  2
# identify clusters
object.list <- identifyClusters(object.list)
#> Identifying cell clusters in Dataset 1 
#> Peforming hierarchical clustering on the computed consensus matrix 
#> The initial guessed number of clusters is 17 
#> Identifying cell clusters in Dataset 2 
#> Peforming hierarchical clustering on the computed consensus matrix 
#> The initial guessed number of clusters is 15

Step3. detect cluster-specific cells with high confident

features.integration = identifyIntegrationFeatures(object.list)
object.list <- identifyConfidentCells(object.list, features.integration)

Step4. Identify marker genes associated with the putative cell clusters in each dataset

object.list <- identifyMarkers(object.list)
#> Identifying marker genes of cell clusters in Dataset 1
#> Calculating cluster 1
#> Calculating cluster 2
#> Calculating cluster 3
#> Calculating cluster 4
#> Calculating cluster 7
#> Calculating cluster 8
#> Calculating cluster 9
#> Calculating cluster 10
#> Calculating cluster 11
#> Calculating cluster 12
#> Identifying marker genes of cell clusters in Dataset 2
#> Calculating cluster 1
#> Calculating cluster 2
#> Calculating cluster 3
#> Calculating cluster 4
#> Calculating cluster 5
#> Calculating cluster 6
#> Calculating cluster 7
#> Calculating cluster 8
#> Calculating cluster 10
#> Calculating cluster 12

Step 5. Learn technical variation between any two datasets

structured_mat <- learnTechnicalVariation(object.list, features.integration)

Step 6. Learn a shared embedding of cells across all datasets after removing technical variation

combined <- merge(x = object.list[[1]],y = object.list[2:length(x = object.list)])
combined@meta.data <- combined@meta.data %>% select(-starts_with("RNA_snn_res"))
combined$sample.name <- factor(combined$sample.name, levels = sample.name)
VariableFeatures(combined) <- features.integration
combined <- integrateData(combined, structured_mat)

Part III: Run the standard workflow for visualization and clustering

Run the standard workflow for clustering

nPC = 40
combined <- FindNeighbors(combined, reduction = "scMC", dims = 1:nPC)
combined <- FindClusters(combined, algorithm = 4, resolution = 0.05)
levels(Idents(combined))
#> [1] "1" "2" "3" "4" "5" "6"
combined <- BuildClusterTree(combined, reorder = T, reorder.numeric = T, verbose = F)
combined <- RunUMAP(combined, reduction='scMC', dims = 1:nPC)

Quick visualization of cells onto the low-dimensional space

dimPlot(combined, reduction = "umap", group.by = c("sample.name","ident"))

### Quick exploration of the marker genes

combined <- ScaleData(combined, feature = rownames(combined), verbose = FALSE)
markers <- FindAllMarkers(combined, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
#> Calculating cluster 1
#> Calculating cluster 2
#> Calculating cluster 3
#> Calculating cluster 4
#> Calculating cluster 5
#> Calculating cluster 6
Misc(combined, slot = 'markers') <- markers
top10 <- markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
doHeatmap(combined, features=top10$gene, group.by='ident', additional.group.by = c('sample.name'))
#> Scale for 'fill' is already present. Adding another scale for 'fill', which
#> will replace the existing scale.

Visualization and annotation

Feature plot of known marker genes

features = c('Lox','Ptch1','Cd68','Pecam1','Myh11','Plp1')
gg <- featurePlot(combined, features = features, show.legend = F, show.axes = F)
#> Scale for 'colour' is already present. Adding another scale for 'colour',
#> which will replace the existing scale.
#> Scale for 'colour' is already present. Adding another scale for 'colour',
#> which will replace the existing scale.
#> Scale for 'colour' is already present. Adding another scale for 'colour',
#> which will replace the existing scale.
#> Scale for 'colour' is already present. Adding another scale for 'colour',
#> which will replace the existing scale.
#> Scale for 'colour' is already present. Adding another scale for 'colour',
#> which will replace the existing scale.
#> Scale for 'colour' is already present. Adding another scale for 'colour',
#> which will replace the existing scale.
gg <- patchwork::wrap_plots(plots = gg, ncol = 3)
gg

cowplot::save_plot(filename=paste0("overlayKnownMarkers_integration_dermis", "_umap.pdf"), plot=gg, base_width = 5.5, base_height = 6)

Annotation

new.cluster.ids <- c("Immune", "Hh-inactive Fib", "Hh-active Fib",  "Schwann", "Endotheial","Muscle")
names(new.cluster.ids) <- levels(combined)
combined <- RenameIdents(combined, new.cluster.ids)
new.order <- c("Hh-inactive Fib", "Hh-active Fib","Immune","Endotheial", "Muscle", "Schwann")
combined@active.ident <- factor(combined@active.ident, levels = new.order)

Visualize cells onto the low-dimensional space

p1 <- dimPlot(combined, reduction = "umap", group.by = "sample.name", colors.ggplot = T)
p2 <- dimPlot(combined, reduction = "umap", label = F)
gg <- patchwork::wrap_plots(p1, p2, ncol = 2)
gg

cowplot::save_plot(filename=paste0("integration_dermis", "_umap.pdf"), plot=gg, base_width = 8, base_height = 3)
# Split the plot into each dataset
dimPlot(combined, reduction = "umap", split.by = "sample.name",  combine = T)

Heatmap of marker genes

markers <- FindAllMarkers(combined, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
#> Calculating cluster Hh-inactive Fib
#> Calculating cluster Hh-active Fib
#> Calculating cluster Immune
#> Calculating cluster Endotheial
#> Calculating cluster Muscle
#> Calculating cluster Schwann
Misc(combined, slot = 'markers') <- markers
top10 <- markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
doHeatmap(combined, features=top10$gene, group.by='ident', additional.group.by = c('sample.name'))
#> Scale for 'fill' is already present. Adding another scale for 'fill', which
#> will replace the existing scale.

Violin plot

Stacked violin plot

features = c('Lox','Ptch1','Cd68','Pecam1','Myh11','Plp1')
gg <- StackedVlnPlot(combined, features = features)
gg

cowplot::save_plot(paste0("violin_markers", "_integration_dermis", ".pdf"), gg, base_height = 3.5, base_width = 2)

Splitted violin plot

features = c('Lox','Ptch1','Cd68','Pecam1','Myh11','Plp1')
gg <- StackedVlnPlot(combined, features = features, split.by = "sample.name")
gg

Violin plot with statistical test

features = c('Pdgfra','Lox','Ptch1','Gli1')
gg <- vlnPlot(combined, features = features, stat.add = T, comparisons = list(c("Hh-inactive Fib", "Hh-active Fib")))
patchwork::wrap_plots(plots = gg, ncol = 4)

Dot plot

dotPlot(combined, features =c('Lox','Ptch1','Cd68','Pecam1','Myh11','Plp1'))
#> Scale for 'colour' is already present. Adding another scale for 'colour',
#> which will replace the existing scale.

Splitted Dot plot

dotPlot(combined, features = c('Pdgfra','Lox','Ptch1','Gli1'), split.by = "sample.name", idents = c("Hh-inactive Fib", "Hh-active Fib"))

Part IV: Downstream analysis

combined$clusters.final <- Idents(combined)

Compute the proportion

gg1 <- computeProportion(combined, x = "clusters.final", fill = "sample.name")
gg2 <- computeProportion(combined, x = "sample.name", fill = "clusters.final", colors.use = scPalette(6))
gg1 + gg2

DEG analysis

markers.sample <- identifyDEG(combined, group.by = "sample.name")
#> Identify DEG using FindAllMarkers
#> Calculating cluster Control
#> Calculating cluster Hh activation
#> Scale for 'fill' is already present. Adding another scale for 'fill', which
#> will replace the existing scale.
#> Saving 3 x 4.5 in image

GO entichment analysis

res.go <- getEnrichedGO(markers, category = "BP", do.simplify = F, idents = c("Hh-inactive Fib", "Hh-active Fib"))
#> Scale for 'colour' is already present. Adding another scale for 'colour',
#> which will replace the existing scale.
#> Saving 6 x 4.5 in image

Save data

combined <- ScaleData(combined)
combined$clusters.final <- Idents(combined)
save(combined, file = "scMC_dermis_SeuratV4.RData")
# combined.loom <- as.loom(combined, filename = "scMC_dermis_SeuratV4.loom", verbose = FALSE)
# combined.loom$close_all()
write.table(GetAssayData(combined),file = "preprocessedData_integration_dermis.txt",sep = '\t')
write.table(combined@meta.data,file = "metaData_integration_dermis.txt",sep = '\t')
write.table(Idents(combined),file = "identity_integration_dermis.txt",sep = '\t')
write.table(Embeddings(combined, "scMC"),file = "integratedSpace_scMC_integration.txt",sep = '\t')
write.table(combined[["umap"]]@cell.embeddings,file = "projectedData_umap_integration_dermis.txt",sep = '\t')
write.table(markers,file = "markers_integration_dermis.txt",sep = '\t')
write.table(top10,file = "markersTop10_integration_dermis.txt",sep = '\t')

# combined.loom <- loomR::connect(filename = "scMC_dermis_SeuratV4.loom", mode = "r")
# combined <- as.Seurat(combined.loom)